In [1]:
# Import libraries
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from sentiment import remove_punctuation, word_lemmatizer, sentiment
In [2]:
# Import White House data
WhiteHouse = pd.read_excel('D:\\Dropbox\\Research\\China Foreign Share Discount\\White House\\WhiteHouse.xlsx', parse_dates=['Date'])
In [3]:
# Combine title and content
WhiteHouse['Text'] = WhiteHouse['Title'] + WhiteHouse['Content']
In [4]:
# Keep Chinese related statements
WhiteHouse_china = WhiteHouse.loc[WhiteHouse['Text'].str.contains('China|Chinese|CHINA|CHINESE', regex=True)]
In [5]:
# tokenizer
tokenizer = RegexpTokenizer(r'\w+')
WhiteHouse_china['Text'] = WhiteHouse_china['Text'].apply(lambda x: tokenizer.tokenize(x.lower()))
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_15100\1760722837.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WhiteHouse_china['Text'] = WhiteHouse_china['Text'].apply(lambda x: tokenizer.tokenize(x.lower()))
In [6]:
# Remove stopwords
WhiteHouse_china['Text'] = WhiteHouse_china['Text'].apply(lambda x: word_lemmatizer(x))
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_15100\2850851681.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WhiteHouse_china['Text'] = WhiteHouse_china['Text'].apply(lambda x: word_lemmatizer(x))
In [7]:
# sentiment
positive = []
negative = []
positive_lm = []
negative_lm = []
wordcount = []
In [8]:
for statement in WhiteHouse_china['Text']:
    x1,y1,x2,y2,z = sentiment(statement)
    positive.append(x1)
    negative.append(y1)
    positive_lm.append(x2)
    negative_lm.append(y2)
    wordcount.append(z)
In [9]:
WhiteHouse_china['pos'] = positive
WhiteHouse_china['neg'] = negative
WhiteHouse_china['pos_lm'] = positive_lm
WhiteHouse_china['neg_lm'] = negative_lm
WhiteHouse_china['wordcount'] = wordcount
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_15100\2668870678.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WhiteHouse_china['pos'] = positive
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_15100\2668870678.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WhiteHouse_china['neg'] = negative
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_15100\2668870678.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WhiteHouse_china['pos_lm'] = positive_lm
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_15100\2668870678.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WhiteHouse_china['neg_lm'] = negative_lm
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_15100\2668870678.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WhiteHouse_china['wordcount'] = wordcount
In [10]:
WhiteHouse_china = WhiteHouse_china[['Date', 'pos', 'neg', 'pos_lm', 'neg_lm', 'wordcount']]
WhiteHouse_china.to_csv("sentiment_WhiteHouse.csv")